# Load necessary libraries
library(xgboost)
library(caret)

# Read the pre-saved training and test data sets
train <- readRDS("train.rds")
test <- readRDS("test.rds")
print(train)
print(test)

# Split the training data 80%:20% into a training split and a hold-out split.
# floor() guards against fractional subscripts when nrow(train) is not a
# multiple of 5 (R truncates fractional indices silently, which makes the
# original `(0.8*n + 1):n` sequence fragile).
n_train <- floor(0.8 * nrow(train))
train_train <- train[1:n_train, ]
test_train <- train[(n_train + 1):nrow(train), ]

# Use caret to find the best hyperparameters using 5-fold CV

# Set up a grid to choose the best values for the following parameters
control <- trainControl(method = "cv", number = 5)

xgb_grid <- expand.grid(
  nrounds = 10,
  eta = c(0.1, 0.05, 0.01),
  max_depth = c(2, 3, 4, 5, 6),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = c(1, 2, 3, 4, 5),
  subsample = 1
)

# Seed so the CV fold assignment (and hence the tune) is reproducible
set.seed(123)
# Predictors are all columns except the label in column 230
xgb_caret <- train(
  x = train_train[, -230], y = train_train[, 230],
  method = "xgbTree", trControl = control, tuneGrid = xgb_grid
)
# NOTE(review): the original run reported "There were missing values in
# resampled performance measures." — some folds produced NA metrics.
xgb_caret$bestTune

#From the run, the best tune parameters are max depth = 2, eta = 0.1 and min_child_weight = 1

# The response lives in column 230; everything else is a predictor.
train_labels <- train_train[, 230]

# Convert the predictor columns to numeric matrices and wrap the training
# and hold-out splits in two separate xgb.DMatrix objects.
train_features <- as.matrix(train_train[, -230])
test_features <- as.matrix(test_train[, -230])
dtrain <- xgb.DMatrix(data = train_features, label = train_labels)
dtest <- xgb.DMatrix(data = test_features)
#Using the best parameters from the tune
# Best parameters from the caret tune (max_depth = 2, eta = 0.1,
# min_child_weight = 1); the remaining values are the xgboost defaults.
parameters <- list(
        # "reg:linear" is a deprecated alias that newer xgboost versions
        # reject; "reg:squarederror" is the supported name for the same
        # squared-error regression objective.
        objective = "reg:squarederror",
        booster = "gbtree",
        eta = 0.1,              # default = 0.3
        gamma = 0,
        max_depth = 2,          # default = 6
        min_child_weight = 1,   # default = 1
        subsample = 1,
        colsample_bytree = 1
)
# Cross-validation using the built-in xgb.cv() to find the best number of
# boosting rounds (via early stopping on the test RMSE).
set.seed(123)
xgbcv <- xgb.cv(
  params = parameters, data = dtrain, nrounds = 10, nfold = 5,
  showsd = TRUE,          # report the std-dev of the CV metric (TRUE/FALSE, not T/F)
  stratified = TRUE,
  print_every_n = 40,     # larger than nrounds, so only first/last rounds print
  early_stopping_rounds = 10,
  maximize = FALSE        # RMSE is minimised
)
# Console output from the original run:
# [1] train-rmse:10.378689+0.006923   test-rmse:10.378591+0.031576
# Multiple eval metrics are present. Will use test_rmse for early stopping.
# Will train until test_rmse hasn't improved in 10 rounds.
#
# [10]    train-rmse:4.037595+0.002668    test-rmse:4.037670+0.029275
# Fit the final model on the 80% training split with the tuned parameters.
# NOTE(review): nrounds = 1032 does not come from the xgb.cv() call above
# (which only searched 10 rounds) — presumably from a longer earlier run;
# confirm where this value originates.
#based on the best tune parameters and nrounds = 1032
xgb_mod <- xgb.train(data = dtrain, params= parameters, nrounds = 1032)
# Predict on the 20% hold-out split
XGBpred <- predict(xgb_mod, dtest)
head(XGBpred)
# Console output from the original run:
# [1] 12.10175 13.16844 11.86608 11.98074 11.95439 12.34753
# NOTE(review): the comment below says the log should be reversed, but no
# exp() is applied here — the plotting code later back-transforms with
# exp(), so the labels appear to be on a log scale. Confirm intent.
predictions_XGB <-XGBpred #need to reverse the log to the real values
# Correlation between predicted and actual (log-scale) hold-out labels
#evaluation of results
cor(predictions_XGB,test_train[,230])
# Console output from the original run:
# [1] 0.9410821
# RMSE on the (log-scale) hold-out labels. Computed inline with base R
# because rmse() comes from the "Metrics" package, which this script never
# loads — the original call would fail with "could not find function".
sqrt(mean((test_train[, 230] - predictions_XGB)^2))
# Console output from the original run:
# [1] 0.1326849
#visualizing the results
# Back-transform predictions and labels with exp() and scatter-plot
# actual against predicted values.
plot(exp(predictions_XGB),exp(test_train[,230]),xlab="Predicted Label",ylab="Actual Label",main="Plot of Actual Against Predicted Labels")
# Straight-line fit of actual ~ predicted on the back-transformed scale
lin.mod=lm(exp(test_train[,230])~exp(predictions_XGB))
pr.lm=predict(lin.mod)
# Overlay the fitted line (blue) and a 45-degree perfect-prediction line
lines(pr.lm~exp(predictions_XGB), col="blue", lwd=0.5)
lines(c(0,450000), c(0,450000))
legend("topleft", legend=c("fitted line", "45 degree line"),col=c("blue", "black"), lty=1, cex=0.8)

#view variable importance plot
#install.packages("Ckmeans.1d.dp")
library(Ckmeans.1d.dp) #required for ggplot clustering
# Per-feature importance (gain/cover/frequency) from the fitted model;
# feature names come from the predictor columns (all except column 230)
mat <- xgb.importance (feature_names = colnames(train_train[,-230]),model = xgb_mod)
# Plot the 20 most important features, scaled relative to the top feature
xgb.ggplot.importance(importance_matrix = mat[1:20], rel_to_first = TRUE)

LS0tDQp0aXRsZTogIlhHQm9vc3QiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7cn0NCiNsb2FkIG5lY2Vzc2FyeSBsaWJyYXJpZXMNCmxpYnJhcnkoeGdib29zdCkNCmxpYnJhcnkoY2FyZXQpDQpgYGANCg0KYGBge3J9DQp0cmFpbj1yZWFkUkRTKCJ0cmFpbi5yZHMiKQ0KdGVzdD1yZWFkUkRTKCJ0ZXN0LnJkcyIpDQpwcmludCh0cmFpbikNCnByaW50KHRlc3QpDQoNCiN0cmFpbjp0ZXN0IC0+IDgwJToyMCUNCnRyYWluX3RyYWluPXRyYWluWzE6KDAuOCpkaW0odHJhaW4pWzFdKSxdDQp0ZXN0X3RyYWluPXRyYWluWygwLjgqZGltKHRyYWluKVsxXSsxKTooZGltKHRyYWluKVsxXSksXQ0KYGBgDQoNClVzZSBjYXJldCB0byBmaW5kIHRoZSBiZXN0IGh5cGVycGFyYW1ldGVycyB1c2luZyA1LWZvbGQgY3YNCmBgYHtyfQ0KI3NldCB1cCBncmlkIHRvIGNob29zZSB0aGUgYmVzdCB2YWx1ZXMgZm9yIHRoZSBmb2xsb3dpbmcgcGFyYW1ldGVycw0KDQpjb250cm9sIDwtdHJhaW5Db250cm9sKG1ldGhvZD0iY3YiLCBudW1iZXI9NSkNCg0KeGdiX2dyaWQgPSBleHBhbmQuZ3JpZCgNCm5yb3VuZHMgPSAxMCwNCmV0YSA9IGMoMC4xLCAwLjA1LCAwLjAxKSwNCm1heF9kZXB0aCA9IGMoMiwgMywgNCwgNSwgNiksDQpnYW1tYSA9IDAsDQpjb2xzYW1wbGVfYnl0cmVlPTEsDQptaW5fY2hpbGRfd2VpZ2h0PWMoMSwgMiwgMywgNCAsNSksDQpzdWJzYW1wbGU9MSkNCg0KeGdiX2NhcmV0IDwtIHRyYWluKHg9dHJhaW5fdHJhaW5bLTIzMF0sIHk9dHJhaW5fdHJhaW5bLDIzMF0sIG1ldGhvZD0neGdiVHJlZScsIHRyQ29udHJvbD0gY29udHJvbCwgdHVuZUdyaWQ9eGdiX2dyaWQpIA0KeGdiX2NhcmV0JGJlc3RUdW5lDQoNCmBgYA0KI0Zyb20gdGhlIHJ1biwgdGhlIGJlc3QgdHVuZSBwYXJhbWV0ZXJzIGFyZSBtYXggZGVwdGggPSAyLCBldGEgPSAwLjEgYW5kIG1pbl9jaGlsZF93ZWlnaHQgPSAgMQ0KDQpgYGB7cn0NCnRyYWluX2xhYmVscyA8LSB0cmFpbl90cmFpblssMjMwXQ0KDQojIHB1dCBvdXIgdGVzdGluZyAmIHRyYWluaW5nIGRhdGEgaW50byB0d28gc2VwZXJhdGVzIERtYXRyaXhzIG9iamVjdHMNCmR0cmFpbiA8LSB4Z2IuRE1hdHJpeChkYXRhID0gYXMubWF0cml4KHRyYWluX3RyYWluWywtMjMwXSksIGxhYmVsPSB0cmFpbl9sYWJlbHMpDQpkdGVzdCA8LSB4Z2IuRE1hdHJpeChkYXRhID0gYXMubWF0cml4KHRlc3RfdHJhaW5bLC0yMzBdKSkNCmBgYA0KDQpgYGB7cn0NCiNVc2luZyB0aGUgYmVzdCBwYXJhbWV0ZXJzIGZyb20gdGhlIHR1bmUNCnBhcmFtZXRlcnMgPC1saXN0KA0KICAgICAgICBvYmplY3RpdmUgPSAicmVnOmxpbmVhciIsDQogICAgICAgIGJvb3N0ZXIgPSAiZ2J0cmVlIiwNCiAgICAgICAgZXRhPTAuMSwgI2RlZmF1bHQgPSAwLjMNCiAgICAgICAgZ2FtbWE9MCwNCiAgICAgICAgbWF4X2RlcHRoPTIsICNkZWZhdWx0PTYNCiAgICAgICAgbWluX2NoaWxkX3dlaWdodD0xLCAjZGVmYXVsdD0xDQogICAgICAgIHN1
YnNhbXBsZT0xLA0KICAgICAgICBjb2xzYW1wbGVfYnl0cmVlPTENCikNCmBgYA0KDQpgYGB7cn0NCiNjcm9zcyB2YWxpZGF0aW9uIHVzaW5nIHRoZSBpbmJ1aWxkIHhnYi5jdigpIHRvIGZpbmQgdGhlIGJlc3Qgbm8gb2Ygcm91bmRzLiANCnNldC5zZWVkKDEyMykNCnhnYmN2IDwtIHhnYi5jdiggcGFyYW1zID0gcGFyYW1ldGVycywgZGF0YSA9IGR0cmFpbiwgbnJvdW5kcyA9IDEwLCBuZm9sZCA9IDUsIHNob3dzZCA9IFQsIHN0cmF0aWZpZWQgPSBULCBwcmludF9ldmVyeV9uID0gNDAsIGVhcmx5X3N0b3BwaW5nX3JvdW5kcyA9IDEwLCBtYXhpbWl6ZSA9IEYpDQpgYGANCg0KYGBge3J9DQojYmFzZWQgb24gdGhlIGJlc3QgdHVuZSBwYXJhbWV0ZXJzIGFuZCBucm91bmRzID0gMTAzMg0KeGdiX21vZCA8LSB4Z2IudHJhaW4oZGF0YSA9IGR0cmFpbiwgcGFyYW1zPSBwYXJhbWV0ZXJzLCBucm91bmRzID0gMTAzMikNCg0KYGBgDQoNCmBgYHtyfQ0KWEdCcHJlZCA8LSBwcmVkaWN0KHhnYl9tb2QsIGR0ZXN0KQ0KaGVhZChYR0JwcmVkKQ0KDQpwcmVkaWN0aW9uc19YR0IgPC1YR0JwcmVkICNuZWVkIHRvIHJldmVyc2UgdGhlIGxvZyB0byB0aGUgcmVhbCB2YWx1ZXMNCg0KYGBgDQoNCmBgYHtyfQ0KI2V2YWx1YXRpb24gb2YgcmVzdWx0cw0KY29yKHByZWRpY3Rpb25zX1hHQix0ZXN0X3RyYWluWywyMzBdKQ0Kcm1zZSh0ZXN0X3RyYWluWywyMzBdLHByZWRpY3Rpb25zX1hHQikNCmBgYA0KDQpgYGB7cn0NCiN2aXN1YWxpemluZyB0aGUgcmVzdWx0cw0KcGxvdChleHAocHJlZGljdGlvbnNfWEdCKSxleHAodGVzdF90cmFpblssMjMwXSkseGxhYj0iUHJlZGljdGVkIExhYmVsIix5bGFiPSJBY3R1YWwgTGFiZWwiLG1haW49IlBsb3Qgb2YgQWN0dWFsIEFnYWluc3QgUHJlZGljdGVkIExhYmVscyIpDQpsaW4ubW9kPWxtKGV4cCh0ZXN0X3RyYWluWywyMzBdKX5leHAocHJlZGljdGlvbnNfWEdCKSkNCnByLmxtPXByZWRpY3QobGluLm1vZCkNCmxpbmVzKHByLmxtfmV4cChwcmVkaWN0aW9uc19YR0IpLCBjb2w9ImJsdWUiLCBsd2Q9MC41KQ0KbGluZXMoYygwLDQ1MDAwMCksIGMoMCw0NTAwMDApKQ0KDQpsZWdlbmQoInRvcGxlZnQiLCBsZWdlbmQ9YygiZml0dGVkIGxpbmUiLCAiNDUgZGVncmVlIGxpbmUiKSxjb2w9YygiYmx1ZSIsICJibGFjayIpLCBsdHk9MSwgY2V4PTAuOCkNCmBgYA0KDQpgYGB7ciwgb3V0LndpZHRoPSIxMDAlIn0NCiN2aWV3IHZhcmlhYmxlIGltcG9ydGFuY2UgcGxvdA0KI2luc3RhbGwucGFja2FnZXMoIkNrbWVhbnMuMWQuZHAiKQ0KbGlicmFyeShDa21lYW5zLjFkLmRwKSAjcmVxdWlyZWQgZm9yIGdncGxvdCBjbHVzdGVyaW5nDQptYXQgPC0geGdiLmltcG9ydGFuY2UgKGZlYXR1cmVfbmFtZXMgPSBjb2xuYW1lcyh0cmFpbl90cmFpblssLTIzMF0pLG1vZGVsID0geGdiX21vZCkNCnhnYi5nZ3Bsb3QuaW1wb3J0YW5jZShpbXBvcnRhbmNlX21hdHJpeCA9IG1hdFsxOjIwXSwgcmVsX3RvX2ZpcnN0ID0gVFJVRSkNCmBgYA0K